Nancy Fraser once described political representation as furnishing the stage where struggles for recognition and distribution occur. This stage, however, can be quite small. Brazil's democratic institutions have been historically criticized for failing to give voice to a range of social actors. The present study seeks to investigate whether Facebook reinforces, or subverts, this diagnosis.
Based on data about the 2022 election for Congress, I initially describe what kind of candidate did not campaign on that platform. The analyses indicate that most candidates who were absent from Facebook were “outsiders”: candidates of color with lower levels of education, whose campaigns used significantly less party resources, and previously had no elected or state jobs.
My analysis turns next to candidates who did have an active account on Facebook. In exploring which campaigns rose to prominence on that platform, I show that already belonging to Brazil’s political class and expenditure on Facebook promotion services predict a greater likelihood of engagement with their posts. This finding reinforces, rather than subverts, existing diagnoses of elite closure, as the unequal distribution of campaign resources helps sustain the country’s political class.
In addition to investigating inequalities in the "supply side," I also examine whether these candidates’ campaigns were equitably distributed across their constituencies. Although Brazilian politicians claim to speak on behalf of vast regions of the country, my analysis demonstrates that certain municipalities are targeted much more than others, which may give rise to regions of representative scarcity in Brazil.
library(tidyverse)
library(readr)
library(stringdist)
library(datasets)
library(janitor)
library(lubridate)
library(tidytext)
library(stringr)
library(tokenizers)
library(wordcloud)
library(textdata)
library(geobr)
library(ggplot2)
library(sf)
library(dplyr)
library(ggmap)
library(broom)
library(RColorBrewer)
library(viridis)
library(igraph)
library(ggraph)
library(widyr)
library(janeaustenr)
library("lexiconPT")
library(stringi)
library(purrr)
#Regression analysis
#install.packages("caret")
#install.packages("car")
#install.packages("leaps")
#install.packages("MASS")
#install.packages("partykit")
#install.packages("nnet")
#install.packages("EMT")
library(caret)
library(car)
library(leaps)
library(MASS)
library(partykit)
library(nnet)
library(EMT)
#DATA VISUALIZATION
#install.packages('maps')
#install.packages("tibble")
#install.packages("forcats")
#install.packages("plotly")
library(maps)
library(plotly)
library(tibble)
library(forcats)
#Facebook Ads
library(highcharter)
library(httr)
library(furrr)
library(Radlibrary)
library(metatargetr)
library(remotes)
# DATASETS
# CrowdTangle export of Facebook posts by parliamentary candidates (MG, 2022).
CrowdTangle <- readr::read_csv('(CART) (COLLECTTED 4-1-2023) CrowdTangle Historical-Report- Parliamentary Candidates MG - 2022-01-01-UNTIL-2023-01-02.csv') %>%
  select(page_name, page_id, page_likes, page_followers, timestamp, body, body_description, views_total, interactions, comments, views_post, shares, likes, likes_wow, likes_love, likes_angry, likes_sad)
# Excluding posts created after the election.
CrowdTangle$timestamp <- as.Date(CrowdTangle$timestamp, format = "%m/%d/%Y")
# Stable per-post identifier (row position at load time).
CrowdTangle$postID <- row.names(CrowdTangle)
cutoff_date <- as.Date("2022-10-02")
# Keep only posts on or before the cutoff date (the original comment said
# "exclude posts before the cutoff", which contradicted the code).
CrowdTangle <- CrowdTangle[CrowdTangle$timestamp <= cutoff_date, ]
# This sample procedure yielded a total of 73,159 Facebook posts.

# Data on the municipalities of the state of Minas Gerais (MG)
MGMunicipalities <- readr::read_csv('ListMunicipalitiesIBGE.csv')

# Data on the campaign expenses (TSE records), restricted to candidates for
# federal deputy.
dfExpensesTotal <- readr::read_csv("dfCampaignExpenses.csv") %>%
  select(SG_UF, DS_CARGO, NR_CANDIDATO, full_name, party, party_abbrev, NM_FORNECEDOR, DS_ORIGEM_DESPESA, DS_DESPESA, VR_DESPESA_CONTRATADA) %>%
  filter(DS_CARGO == "Deputado Federal")

# Total expenses grouped by candidate.
dfExpenses <- dfExpensesTotal %>%
  group_by(full_name) %>%
  summarise(Expenses = sum(VR_DESPESA_CONTRATADA))

# Expenses with online content promotion ("Impulsionamento de Conteúdos").
# Fixed idiom: dropped `negate = F` — FALSE is the default, and T/F
# abbreviations are discouraged because they are reassignable.
dfExpenses_FB <- dfExpensesTotal %>%
  filter(str_detect(DS_ORIGEM_DESPESA, "Despesa com Impulsionamento de Conteúdos")) %>%
  group_by(full_name) %>%
  summarise(ExpensesAds = sum(VR_DESPESA_CONTRATADA))

dfExpenses <- left_join(dfExpenses, dfExpenses_FB, by = "full_name")
I combined the TSE data set with information about the candidates’ communications on social media, specifically their account names, number of views and number of interactions.
# First, I create a new dataset combining all interactions these candidates
# received during the election (one row per Facebook page).
CrowdTangle_Engagement <- CrowdTangle %>%
  group_by(page_name) %>%
  summarize(N_Posts = n(),
            Total_Views = sum(views_total),
            Total_Interactions = sum(interactions),
            Total_Comments = sum(comments),
            Total_Post_Views = sum(views_post),
            Total_Shares = sum(shares),
            Total_Likes = sum(likes),
            Total_Angry = sum(likes_angry)) # fixed column-name typo (was Total_Angy); not referenced downstream
# Had to combine manually this analysis with the previous TSE dataset.
write.csv2(CrowdTangle_Engagement, file = "CrowdTangle_Engagement.csv")
# The manually combined TSE dataset is read back in (semicolon-delimited):
dfTSE <- read_delim("df_TSE.csv", delim = ";",
                    escape_double = FALSE, trim_ws = TRUE)
# Attach the engagement aggregates to the TSE candidate records.
df_combined <- left_join(dfTSE, CrowdTangle_Engagement, by = "page_name")
# Translating the gender variable (Portuguese -> English labels).
# NOTE(review): 'Masculin' (sic, without the trailing 'e') is kept deliberately —
# the regression recoding below (str_replace_all('Masculin', '1')) and the
# printed output tables depend on this exact spelling; "fixing" it here would
# break those steps.
df_combined$gender <- str_replace_all(df_combined$gender,'Masculino','Masculin')
df_combined$gender <- str_replace_all(df_combined$gender,'Feminino','Feminine')
# Translating the race variable; TSE's PRETA/PARDA/AMARELA/INDÍGENA categories
# are collapsed into a single 'Non-White' group.
df_combined$race <- case_when(
  df_combined$race == 'BRANCA' ~ 'White',
  df_combined$race %in% c('PRETA', 'PARDA', 'AMARELA', 'INDÍGENA') ~ 'Non-White',
  df_combined$race == 'SEM INFORMAÇÃO' ~ 'No information',
  TRUE ~ df_combined$race  # leave any unexpected value unchanged
)
# Translating the education variable, dichotomized at "college complete".
df_combined$education <- case_when(
  df_combined$education == 'Superior completo' ~ 'College Complete',
  df_combined$education %in% c('Superior incompleto', 'Ensino Médio completo', 'Ensino Médio incompleto',
                               'Ensino Fundamental completo', 'Ensino Fundamental incompleto', 'Lê e escreve') ~ 'College incomplete or less',
  TRUE ~ df_combined$education  # leave any unexpected value unchanged
)
# Translating (some) previous occupations; untranslated occupations keep their
# original Portuguese labels (see the TRUE ~ fallback).
df_combined$occupation <- case_when(
  df_combined$occupation == 'Gari ou Lixeiro' ~ 'Cleaner',
  df_combined$occupation == 'Deputado' ~ 'Deputy',
  df_combined$occupation == 'Ator e Diretor de Espetáculos Públicos' ~ 'Actor or Director of Public Spectacles',
  df_combined$occupation == 'Vereador' ~ 'Member of City Council',
  df_combined$occupation == 'Veterinário' ~ 'Veterinarian',
  df_combined$occupation == 'Servidor Público Federal' ~ 'Public Servant',
  df_combined$occupation == 'Policial Civil' ~ 'Police (Civil)',
  df_combined$occupation == 'Jornalista e Redator' ~ 'Journalist',
  df_combined$occupation == 'Advogado' ~ 'Lawyer',
  df_combined$occupation == 'Psicólogo' ~ 'Psychologist',
  df_combined$occupation == 'Fotógrafo e Assemelhados' ~ 'Photographer (and similar professions)',
  df_combined$occupation == 'Pecuarista' ~ 'Rancher',
  df_combined$occupation == 'Outros' ~ 'Other',
  df_combined$occupation == 'Empresário' ~ 'Businessperson',
  df_combined$occupation == 'Comerciante' ~ 'Merchant',
  df_combined$occupation == 'Administrador' ~ 'Manager',
  df_combined$occupation == "Aposentado (Exceto Servidor Público)" ~ 'Retired (except former public servants)',
  df_combined$occupation == 'Agente Administrativo' ~ 'Administrative Agent',
  df_combined$occupation == 'Agente de Saúde e Sanitarista' ~ 'Health and Sanitation Agent',
  TRUE ~ df_combined$occupation
)
# Parsing spent and received funds (strings with currency formatting -> numeric).
df_combined$received_funds <- parse_number(df_combined$received_funds)
df_combined$spent_funds <- parse_number(df_combined$spent_funds)
# Creating a metric for absolute engagement: post views + interactions + comments.
# (Column name keeps the original "Absolut" spelling — downstream code uses it.)
df_combined$Absolut_Engagement = rowSums(df_combined[,c("Total_Post_Views", "Total_Interactions", "Total_Comments")])
# Creating a metric for relative engagement: absolute engagement per post.
df_combined$Relative_Engagement = rowSums(df_combined[,c("Total_Post_Views", "Total_Interactions", "Total_Comments")]) / df_combined$N_Posts
# Adding data about candidates' declared wealth (Bens_Declarados) and campaign expenses.
Wealth <- read.csv("Wealth.csv") %>%
  select(full_name, Bens_Declarados)
df <- left_join(df_combined, Wealth, by="full_name")
df <- left_join(df, dfExpenses, by="full_name")
Descriptive analysis of the entire sample
# Descriptive analysis of the entire sample: mean resources and engagement per
# gender x race x education group.
df_total <- df %>%
  mutate(interaction = interaction(gender, race, education, sep = " ")) %>%
  group_by(interaction) %>%
  summarise(Composition = n(),
            Expenses = mean(Expenses, na.rm = TRUE),
            ExpensesAds = mean(ExpensesAds, na.rm = TRUE),
            Wealth = mean(Bens_Declarados, na.rm = TRUE),
            RelativeEngagement = mean(Relative_Engagement, na.rm = TRUE))
# Drop groups with missing summary values before plotting.
df_total_arranged <- df_total %>%
  filter(!is.na(ExpensesAds)) %>%
  filter(!is.na(Wealth)) %>%
  filter(!is.na(RelativeEngagement))
# Boxplot of ad expenditure by group.
# Fixed: the original aes() mapped the grouping factor twice (once positionally,
# once as y) and left x as a dummy constant (""), so the numeric variable was
# never plotted; x is now the numeric axis, matching xlab().
df_total_arranged %>%
  ggplot(aes(x = ExpensesAds, y = fct_reorder(interaction, ExpensesAds))) +
  geom_boxplot(fill = "slateblue", alpha = 0.2, notch = TRUE, fatten = 5) +
  xlab("Expenditure on Facebook services") +
  ylab("")
# Boxplot of relative engagement by group (same fix as above).
df_total_arranged %>%
  ggplot(aes(x = RelativeEngagement, y = fct_reorder(interaction, RelativeEngagement))) +
  geom_boxplot(fill = "slateblue", alpha = 0.2, notch = TRUE, fatten = 5) +
  xlab("Relative Engagement on Facebook") +
  ylab("")
# Candidates with no active Facebook page (no CrowdTangle match).
df_nas <- df[is.na(df$page_name), ]
# Composition, mean expenses and mean declared wealth per
# gender x race x education group among the absentees.
nas_intersections <- df_nas %>%
  mutate(interaction = interaction(gender, race, education, sep = " ")) %>%
  group_by(interaction) %>%
  summarise(Composition = n(),
            Expenses = mean(Expenses, na.rm = TRUE),
            Wealth = mean(Bens_Declarados, na.rm = TRUE)) %>%
  # Fixed: `by_group = TRUE` is not an arrange() argument (the real one is
  # `.by_group`); it was silently treated as a constant sort key. The result
  # of summarise() here is ungrouped, so no grouping argument is needed.
  arrange(desc(Wealth))
nas_intersections
## # A tibble: 12 × 4
##    interaction                              Composition Expenses  Wealth
##    <fct>                                          <int>    <dbl>   <dbl>
##  1 Masculin Non-White College Complete               76   2.38e7  1.78e9
##  2 Feminine White College incomplete or less         50   1.45e7  1.28e9
##  3 Masculin White College incomplete or less         74   1.11e7  1.03e9
##  4 Masculin Non-White College incomplete or less    133   7.57e6  6.99e8
##  5 Masculin White College Complete                  115   2.63e7  4.66e8
##  6 Feminine White College Complete                   52   1.09e7  1.19e8
##  7 Feminine Non-White College incomplete or less    111   5.81e6  8.33e7
##  8 Feminine Non-White College Complete               41   8.49e6  3.78e7
##  9 Masculin No information College incomplete or …    3   3.75e6  2.61e7
## 10 Feminine No information College Complete           1   NaN     NaN
## 11 Masculin No information College Complete           2   1.54e6  NaN
## 12 Feminine No information College incomplete or …    1   1.5 e6  NaN
Analysis: there were 1018 candidates who ran for the Chamber of Federal Deputies in 2022, 659 of them did not have an active, public account on Facebook. By analyzing the intersection of gender, race, and education, we can verify that the most numerous kind of candidate that was absent from this platform were men of color with lower levels of education (n = 133).
Specifically, we have:
| Gender | 256 women | 403 men |
| Race | 361 non-white | 291 white |
| Education | 287 with college complete | 372 college incomplete or less |
After describing the composition of this group of candidates, I explored what resources they had to campaign. To verify which profile received less support from their political parties, I computed the mean value of campaign expenses per group. Consistent with the description above, this analysis revealed that candidates of color with lower levels of education ran campaigns with significantly lower budgets compared to other candidates. This finding suggests that campaign funding may be a determinant factor in the participation of candidates from minority backgrounds on Facebook.
More specifically, it is important to note that there were fewer female candidates on Facebook, and compared to their male counterparts, they had less party resources –specifically, women without a college degree ranked the lowest in campaign expenses. Women of color already constituted the group with lowest declared wealth in this sample, so they could not resort to personal funding as much as other candidates.
Next, I was interested in identifying the most frequent previous occupation of this group. This analysis reveals that most candidates who were absent from Facebook came from careers other than politics.
# Most frequent previous occupations among candidates absent from Facebook,
# with the mean campaign expenses of each occupation.
interactions_occupation <- df_nas %>%
  group_by(occupation) %>%
  summarise(Composition = n(),
            Expenses = mean(Expenses, na.rm = TRUE))
# Fixed: dropped `by_group = TRUE` — arrange() has no such argument (the real
# one is `.by_group`), so it was silently used as a constant sort key.
interactions_occupation %>%
  arrange(desc(Composition))
## # A tibble: 98 × 3
##    occupation                              Composition  Expenses
##    <chr>                                         <int>     <dbl>
##  1 Other                                           147 11460374.
##  2 Businessperson                                   72 20854973.
##  3 Lawyer                                           62 26179071.
##  4 Retired (except former public servants)          34   2825338
##  5 Merchant                                         27  4705759.
##  6 Manager                                          22 31177542.
##  7 Member of City Council                           17 19103013.
##  8 Servidor Público Estadual                        14  4013734.
##  9 Servidor Público Municipal                       13  9229877.
## 10 Policial Militar                                 12  4544232.
## # ℹ 88 more rows
The analyses above indicate that most candidates who were absent from Facebook were “outsiders”, that is, candidates who previously held no elected or state job. They were predominantly candidates of color with lower levels of education, whose campaigns used significantly fewer party resources. We can say that a typical case of a candidate who was not on Facebook was that of a non-white businessman who never graduated from college. This candidate’s campaign expenses were less than half the average of the entire sample.
A significant proportion of female candidates did not have an active Facebook account (256 of a total of 343). This group ran their campaigns on lower budget and they could not resort to the same level of personal wealth as their male counterparts. Among those without a Facebook account, it is noteworthy that a woman of color without a college degree spent on average only fifty eight thousand on her campaign. For the sake of comparison, a white male candidate with higher education who was on Facebook spent eleven times more.
My literature review suggested that candidates would not compete on equal footing over their constituencies of interest. Traditional politicians —who are predominantly white, upper-class men— have historically received greater attention in the media. So I hypothesized that social media platforms would extend their visibility in ways that still reproduce inequalities in access to means of communication, conforming to what Strandberg (2013) described as the normalization thesis. To verify that, I conducted the following analyses: first, I counted the number of candidates in each demographic group and combined the number of interactions each group received during this election.
# Candidates with at least one collected Facebook post (active on the platform).
df_clean <- filter(df, !is.na(N_Posts))
# Frequency of education levels among candidates active on Facebook.
# A similar one-liner was used to verify the number of candidates of
# different gender and race.
table(df_clean$education)
##
##           College Complete College incomplete or less
##                        243                        106
| Gender | 86 women | 265 men |
| Race | 146 non-white | 205 white |
| Education | 107 college incomplete or less | 244 college complete |
# Engagement and resources summarized by gender.
interactions_gender <- df_clean %>%
  group_by(gender) %>%
  summarise(n = n(),
            Engagement = sum(Absolut_Engagement),
            Mean = mean(Absolut_Engagement),
            sd = sd(Absolut_Engagement),
            Comments = sum(Total_Comments),
            Expenses = mean(Expenses, na.rm = TRUE),  # idiom: TRUE, not T
            ExpensesAds = mean(ExpensesAds, na.rm = TRUE))
# Fixed (here and below): dropped `by_group = TRUE` — arrange() has no such
# argument (the real one is `.by_group`); it was silently used as a constant.
interactions_gender %>%
  arrange(desc(Mean))
## # A tibble: 2 × 8
##   gender       n Engagement     Mean       sd Comments  Expenses ExpensesAds
##   <chr>    <int>      <dbl>    <dbl>    <dbl>    <dbl>     <dbl>       <dbl>
## 1 Masculin   264  401476609 1520745. 9156049. 10129432 70478101.    5287173.
## 2 Feminine    85   35527619  417972. 1857266.   833309 52049764.    3647862.
# Engagement and resources summarized by race.
interactions_race <- df_clean %>%
  group_by(race) %>%
  summarise(n = n(),
            Engagement = sum(Absolut_Engagement),
            Mean = mean(Absolut_Engagement),
            sd = sd(Absolut_Engagement),
            Comments = sum(Total_Comments),
            Expenses = mean(Expenses, na.rm = TRUE),
            ExpensesAds = mean(ExpensesAds, na.rm = TRUE))
interactions_race %>%
  arrange(desc(Mean))
## # A tibble: 2 × 8
##   race          n Engagement     Mean       sd Comments  Expenses ExpensesAds
##   <chr>     <int>      <dbl>    <dbl>    <dbl>    <dbl>     <dbl>       <dbl>
## 1 White       205  282905008 1380024. 8535691.  7262158 78363977.    5593251.
## 2 Non-White   144  154099220 1070133. 7263727.  3700583 46396467.    3259382.
# Engagement and resources summarized by education.
interactions_education <- df_clean %>%
  group_by(education) %>%
  summarise(n = n(),
            Engagement = sum(Absolut_Engagement),
            Mean = mean(Absolut_Engagement),
            sd = sd(Absolut_Engagement),
            Comments = sum(Total_Comments),
            Expenses = mean(Expenses, na.rm = TRUE),
            ExpensesAds = mean(ExpensesAds, na.rm = TRUE))
interactions_education %>%
  arrange(desc(Mean))
## # A tibble: 2 × 8
##   education             n Engagement   Mean     sd Comments Expenses ExpensesAds
##   <chr>             <int>      <dbl>  <dbl>  <dbl>    <dbl>    <dbl>       <dbl>
## 1 College incomple…   106  142032071 1.34e6 8.40e6  3311378   4.35e7    5021695.
## 2 College Complete    243  294972157 1.21e6 7.87e6  7651363   7.48e7    4792408.
# Create the interaction terms by combining the categorical variables.
# (interactionRG and interactionGE are kept for possible later use; only
# `interaction` is summarized below.)
df_interaction <- df_clean %>%
  mutate(interactionRG = interaction(race, gender, sep = " "),
         interactionGE = interaction(gender, education, sep = " "),
         interaction = interaction(gender, race, education, sep = " "))
# Engagement and resources per gender x race x education group.
intersectional_analysis <- df_interaction %>%
  group_by(interaction) %>%
  summarise(n = n(),
            Engagement_Total = sum(Absolut_Engagement),
            Engagement_Mean = mean(Absolut_Engagement),
            sd = sd(Absolut_Engagement),
            Comments = sum(Total_Comments),
            Expenses = mean(Expenses, na.rm = TRUE),  # idiom: TRUE, not T
            ExpensesAds = mean(ExpensesAds, na.rm = TRUE))
# Fixed: dropped `by_group = TRUE` — not an arrange() argument (the real one
# is `.by_group`); it was silently used as a constant sort key.
intersectional_analysis %>%
  arrange(desc(Engagement_Mean))
## # A tibble: 8 × 8
##   interaction       n Engagement_Total Engagement_Mean     sd Comments Expenses
##   <fct>         <int>            <dbl>           <dbl>  <dbl>    <dbl>    <dbl>
## 1 Masculin Non-…    47         95660791        2035336. 1.24e7  2516281   3.67e7
## 2 Masculin Whit…   122        235081713        1926899. 1.09e7  6432463   9.46e7
## 3 Masculin Whit…    39         42311672        1084915. 2.95e6   725435   6.17e7
## 4 Feminine Non-…    32         27479623         858738. 2.97e6   685377   6.17e7
## 5 Masculin Non-…    56         28422433         507543. 1.80e6   455253   4.69e7
## 6 Feminine Non-…     9          2536373         281819. 7.58e5    43672   3.95e7
## 7 Feminine Whit…    11          1523235         138476. 2.71e5    25990   1.52e7
## 8 Feminine Whit…    33          3988388         120860. 1.99e5    78270   6.02e7
## # ℹ 1 more variable: ExpensesAds <dbl>
# Engagement and resources summarized by previous occupation
# (candidates active on Facebook only).
occupation_interactions <- df_clean %>%
  group_by(occupation) %>%
  summarise(n = n(),
            Engagement = sum(Absolut_Engagement),
            Mean = mean(Absolut_Engagement),
            Expenses = mean(Expenses, na.rm = TRUE),  # idiom: TRUE, not T
            ExpensesAds = mean(ExpensesAds, na.rm = TRUE))
# Fixed: dropped `by_group = TRUE` — not an arrange() argument (the real one
# is `.by_group`); it was silently used as a constant sort key.
occupation_interactions %>%
  arrange(desc(Mean))
## # A tibble: 68 × 6
##    occupation                           n Engagement   Mean Expenses ExpensesAds
##    <chr>                            <int>      <dbl>  <dbl>    <dbl>       <dbl>
##  1 Cleaner                              2   84736263 4.24e7   1.07e7     4000000
##  2 Deputy                              33  174381178 5.28e6   2.21e8   11190643.
##  3 Actor or Director of Public Spe…     3   14576219 4.86e6   4.18e7     9687750
##  4 Member of City Council              32   86208501 2.69e6   7.25e7    4417140.
##  5 Photographer (and similar profe…     1    1749343 1.75e6   2.91e8    17500000
##  6 Journalist                           6    6835541 1.14e6   6.87e7     5460000
##  7 Lawyer                              36   31831931 8.84e5   7.66e7    5760613.
##  8 Rancher                              1     782255 7.82e5   1.60e7     3700000
##  9 Veterinarian                         2    1325929 6.63e5   2.12e8     5193500
## 10 Servidor Público Civil Aposenta…     3    1518381 5.06e5   6.48e7    4148986.
## # ℹ 58 more rows
In absolute numbers, most interactions that occurred on Facebook were unequally distributed: they were directed to white, male candidates with higher levels of education. Considering the low number of female candidates on this platform (86, only ten of which were women of color without a college degree), it is not surprising that we found this distribution in absolute numbers. Yet the same distribution was found when calculating the average engagement per composition of the groups (except for education). Therefore, gender appears to be an important factor in this analysis, as most online interactions occurred with men.
If we consider the intersection of our variables of interest, the groups that received the most interactions online were composed of men of color without a college degree (even though their campaigns ran on relatively low budgets), followed by white men of all levels of education.
Parliamentarians seeking re-election and former members of municipal councils ranked high in this analysis, which corroborates previous studies suggesting that Brazilian elections for Congress favor incumbents and those already belonging to the country’s political class. (The other previous occupations that figured on top were cleaner, actor, and photographer, but there were only three or fewer candidates from each of these occupations.)
# Engagement and resources summarized by party (candidates active on Facebook).
interactions_party <- df_clean %>%
  group_by(party_abbrev) %>%
  summarise(n = n(),
            Engagement = sum(Absolut_Engagement),
            Mean = mean(Absolut_Engagement),
            Comments = sum(Total_Comments),
            Expenses = mean(Expenses, na.rm = TRUE),  # idiom: TRUE, not T
            ExpensesAds = mean(ExpensesAds, na.rm = TRUE))
# Fixed: dropped `by_group = TRUE` — not an arrange() argument (the real one
# is `.by_group`); it was silently used as a constant sort key.
interactions_party %>%
  arrange(desc(Mean))
## # A tibble: 30 × 7
##    party_abbrev      n Engagement     Mean Comments   Expenses ExpensesAds
##    <chr>         <int>      <dbl>    <dbl>    <dbl>      <dbl>       <dbl>
##  1 AVANTE           30  192297519 6409917.  6936781  54349481.    3596254.
##  2 PL               20   78840738 3942037.   692163  73159389.    4982286.
##  3 PTB               9   15062984 1673665.   384665  55714540.    9341895.
##  4 PT               28   46333715 1654776.  1627701 100941625.    6147906.
##  5 REPUBLICANOS     20   22103616 1105181.   428653  65512870.    3729568.
##  6 PP               17   16032415  943083.   260934 118822675.    3706815.
##  7 UNIÃO            17   14808177  871069.    88230 112611188.    6018346.
##  8 PATRIOTA         20   16541623  827081.   102276  82878097.    7895647.
##  9 PSDB             13    6760774  520060.    45728 110666903.   10256125
## 10 SOLIDARIEDADE    13    5891270  453175.   131087  36868543.    1756597.
## # ℹ 20 more rows
My next step was to examine whether gender, race, education helped explain what candidates received more interactions online. To that end, I included variables about their campaign budget, specifically: the amount of money they received to campaign and the amount of money they declared to have used.
# Pearson correlation between ad spending (ExpensesAds) and relative engagement.
CorrelationFundsEngagement <- cor.test(df_clean$ExpensesAds, df_clean$Relative_Engagement,
                                       method = "pearson")
CorrelationFundsEngagement
##
##  Pearson's product-moment correlation
##
## data:  df_clean$ExpensesAds and df_clean$Relative_Engagement
## t = 5.8382, df = 237, p-value = 1.727e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2384247 0.4607545
## sample estimates:
##       cor
## 0.3545916
# Jittered scatterplot of ad expenses vs. relative engagement, colored by party.
# `text = name_ballot` is not a ggplot2 aesthetic (hence the warning below) but
# is used by plotly for hover tooltips after the ggplotly() conversion.
BV_Engagement <- ggplot()+
  geom_jitter(data=df_clean, aes(x = ExpensesAds, y = Relative_Engagement, color = party_abbrev, text=name_ballot)) +
  labs(title = "Relationship between online campaign expenses and engagement", x = "Expenses", y = "Engagement online")+
  guides(col = guide_legend(title = "", ncol = 1))
## Warning in geom_jitter(data = df_clean, aes(x = ExpensesAds, y =
## Relative_Engagement, : Ignoring unknown aesthetics: text
InteractiveEngagement <- ggplotly(BV_Engagement)
InteractiveEngagement
# Distribution of relative engagement (to assess skew before transformation).
hist(df_clean$Relative_Engagement,
     breaks = 100, # Suggest ~100 bins (the original comment claiming the 'FD' method was inaccurate)
     xlim = c(0, 20000),
     col = "skyblue", # Set the color of the bars
     border = "white", # Set the color of the bar borders
     xlab = "Relative Engagement", # Set the x-axis label
     main = "Histogram of Relative Engagement" # Set the main title
)
Given the highly skewed distributions of reaction data, the engagement variables (absolute and relative) were log-transformed.
# Transforming the data for regression: work on a copy of df_clean.
df_regression_log <- df_clean
# NOTE(review): this interaction term orders factors (race, gender, education),
# unlike the earlier (gender, race, education) term — confirm this is intended.
df_regression_log <- df_regression_log %>%
  mutate(interaction = interaction(race, gender, education, sep = " "))
# Adding political-class indicator: 1 if the previous occupation was Deputy or
# Member of City Council, 0 otherwise.
df_regression_log <- df_regression_log %>%
  mutate(PoliticalClass = df_regression_log$occupation)
df_regression_log$PoliticalClass <- case_when(
  df_regression_log$PoliticalClass %in% c('Deputy', 'Member of City Council') ~ '1', TRUE ~ '0')
# Removing NAs: candidates with no recorded ad expenses are treated as zero.
df_regression_log$ExpensesAds[is.na(df_regression_log$ExpensesAds)] <- 0
# Transforming categorical variables into integer dummies.
# Note: order matters — 'Feminine' must be replaced before 'Masculin' is used
# as a pattern, since str_replace_all matches substrings.
df_regression_log$gender <- str_replace_all(df_regression_log$gender,
                                            'Feminine','0')
df_regression_log$gender <- str_replace_all(df_regression_log$gender,
                                            'Masculin','1')
df_regression_log$race <- str_replace_all(df_regression_log$race,
                                          "Non-White", "0")
df_regression_log$race <- str_replace_all(df_regression_log$race,
                                          "White", "1")
df_regression_log$education <- str_replace_all(df_regression_log$education,
                                               'College incomplete or less','0')
df_regression_log$education <- str_replace_all(df_regression_log$education,
                                               'College Complete','1')
df_regression_log$gender <- as.integer(df_regression_log$gender)
df_regression_log$race <- as.integer(df_regression_log$race)
df_regression_log$education <- as.integer(df_regression_log$education)
df_regression_log$PoliticalClass <- as.integer(df_regression_log$PoliticalClass)
df_regression_log$interaction <- as.integer(df_regression_log$interaction)
# Log transformation of the engagement variables, with a +5 offset to keep
# zero/small values finite.
# NOTE(review): log() is the natural logarithm, while the accompanying text
# says "logarithmic 10" — confirm which is intended.
df_regression_log$Relative_Engagement <- log(df_regression_log$Relative_Engagement + 5)
df_regression_log$Absolut_Engagement <- log(df_regression_log$Absolut_Engagement + 5)
# NOTE(review): as.integer() truncates the log values to whole numbers,
# discarding fractional information — presumably to obtain counts for the
# negative binomial model; confirm this is intended.
df_regression_log$Relative_Engagement <- as.integer(df_regression_log$Relative_Engagement)
df_regression_log$Absolut_Engagement <- as.integer(df_regression_log$Absolut_Engagement)
# Distribution of the transformed absolute engagement.
hist(df_regression_log$Absolut_Engagement,
     col = "skyblue", # Set the color of the bars
     border = "white", # Set the color of the bar borders
     xlab = "Absolut Engagement", # Set the x-axis label
     main = "Histogram of Absolut Engagement" # Set the main title
)
Binomial Regression Model
Because my dependent variable — engagement (log-transformed and truncated to integers) — is a count with an over-dispersed distribution, the most appropriate statistical test is a negative binomial regression. (The model below uses absolute engagement as the dependent variable.)
# Negative binomial regression model (MASS::glm.nb).
# DV: Absolut_Engagement (log-transformed, integer-truncated above).
# Predictors: demographics, political-class indicator, ad expenses, total
# campaign expenses, and declared wealth (Bens_Declarados).
Model1 <- glm.nb(Absolut_Engagement ~ gender + race + education + PoliticalClass
                 + ExpensesAds + Expenses + Bens_Declarados,
                 data = df_regression_log)
# Omnibus test (Type III analysis of deviance; car::Anova) — left commented
# out; results are reported in the table below.
#omnibus <- Anova(Model1, type = "III")
#omnibus
Omnibus Test | Analysis of Deviance Table (Type III tests)
| LR Chisq | Df | Pr (>Chisq) | |
|---|---|---|---|
| gender | 0.13 | 1 | 0.72 |
| race | 2.19 | 1 | 0.14 |
| education | 1.72 | 1 | 0.19 |
| PoliticalClass | 8.15 | 1 | 0.00 (***) |
| ExpensesAds | 12.30 | 1 | 0.00 (***) |
| Expenses | 17.66 | 1 | 2.62e-05 (***) |
| Wealth | 0.33 | 1 | 0.56 |
The result of the omnibus test showed that expenses with Facebook services [χ2 = 12.30, p<0.001], expenses with the campaign in general [χ2 = 17.66, p <0.001] and belonging to the political class [χ2 = 8.15, p<0.001] significantly improved model fit over a null model.
# Check for multicollinearity among the predictors using car::vif().
vif_results <- vif(Model1)
print(vif_results)
##         gender           race      education PoliticalClass    ExpensesAds
##       1.022221       1.057075       1.042680       1.386069       1.358037
##       Expenses Bens_Declarados
##       1.722170        1.018314
# All VIFs are close to 1, indicating no severe multicollinearity.
Next, I assessed multicollinearity in the model. All the Variance Inflation Factors (VIF) values were close to 1, which suggests that there is no severe multicollinearity between these variables.
# Print the summary of the fitted negative binomial model.
summary(Model1)
##
## Call:
## glm.nb(formula = Absolut_Engagement ~ gender + race + education +
##     PoliticalClass + ExpensesAds + Expenses + Bens_Declarados,
##     data = df_regression_log, init.theta = 135376.8946, link = log)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -3.2304  -0.6514   0.0886   0.5735   3.1169
##
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)     1.981e+00  6.129e-02  32.315  < 2e-16 ***
## gender          1.727e-02  4.795e-02   0.360 0.718748
## race            6.440e-02  4.360e-02   1.477 0.139648
## education       6.265e-02  4.798e-02   1.306 0.191633
## PoliticalClass  1.502e-01  5.229e-02   2.873 0.004065 **
## ExpensesAds     8.980e-09  2.498e-09   3.595 0.000324 ***
## Expenses        1.101e-09  2.588e-10   4.254  2.1e-05 ***
## Bens_Declarados 2.082e-12  3.564e-12   0.584 0.559066
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(135376.9) family taken to be 1)
##
##     Null deviance: 378.62  on 268  degrees of freedom
## Residual deviance: 264.57  on 261  degrees of freedom
##   (80 observations deleted due to missingness)
## AIC: 1364.8
##
## Number of Fisher Scoring iterations: 1
##
##
##               Theta:  135377
##           Std. Err.:  1309109
## Warning while fitting theta: iteration limit reached
##
##  2 x log-likelihood:  -1346.838
# NOTE(review): theta is enormous (~1.35e5) with an even larger standard error
# and an iteration-limit warning — the NB dispersion parameter appears
# unidentified, i.e. the transformed counts look near-Poisson. Consider
# comparing against a plain Poisson GLM; confirm.
# Incidence-rate ratios for the significant predictors: IRR = exp(beta).
# Improved: extract the estimates from the fitted model via coef() instead of
# re-typing the rounded values printed in the summary (which silently drifts
# if the model is refit); also removed a leftover placeholder comment.
IRR_Expenses <- exp(coef(Model1)[["Expenses"]])
IRR_ExpensesAds <- exp(coef(Model1)[["ExpensesAds"]])
IRR_PoliticalClass <- exp(coef(Model1)[["PoliticalClass"]])
Reporting results from model 1 | The negative binomial regression showed that already belonging to Brazil’s political class (β = 0.15, IRR ≈ 1.16, p < 0.01), campaign expenses (β = 1.10e-09, IRR ≈ 1.00, p < 0.001) and, more specifically, expenses with Facebook services (β = 8.98e-09, IRR ≈ 1.00, p < 0.001) had a significant relationship with online engagement.
What are the constituencies (geographically defined) parliamentarians reached out to the most? Conversely, what constituencies were left aside in this election?
My literature review also stated that I do not presume that information and propaganda, our economy of claims’ main products, are equitably distributed across any given state. I expect, contrarily, that such an economy would suffer from fundamental scarcities in the distribution of its product.
In order to test that, I studied whether there were constituencies which received less attention during this electoral period, thus being left aside in these campaigners’ political communication.
My first step was to count the number of times any given municipality in the state of Minas Gerais was mentioned in the posts collected from CrowdTangle.
# All municipalities in the state of Minas Gerais (geobr, reference year 2020;
# the result carries a name_muni column used for matching below).
muni <- read_municipality(code_muni= "MG",
                          year=2020,
                          showProgress = FALSE)
#### Normalizing Latin/Portuguese characters so municipality names can be
#### matched against post text.
## CrowdTangle post bodies
CrowdTangle_Clean <- CrowdTangle
CrowdTangle_Clean$body = stri_trans_general(str = CrowdTangle_Clean$body, id = "Latin-ASCII") # strip accents/diacritics
CrowdTangle_Clean$body <- tolower(CrowdTangle_Clean$body) # lower case
## Municipality names (same normalization, so both sides match)
muni_clean <- muni
muni_clean$name_muni = stri_trans_general(str = muni_clean$name_muni, id = "Latin-ASCII")
muni_clean$name_muni <- tolower(muni_clean$name_muni)
################################
# Tokenization and identification
# Posts whose body mentions at least one MG municipality name.
# paste(collapse = "|") concatenates all names into a single regex alternation
# matching any of them. Fixed idiom: dropped `negate = F` (FALSE is the
# default, and T/F abbreviations are discouraged).
posts_with_muni <- CrowdTangle_Clean %>%
  filter(
    str_detect(body, paste(muni_clean$name_muni, collapse = "|"))
  )
# This procedure identified 29,536 posts.
# Now, identify the occurrences in the corpus (both across the dataset and per
# candidate). First, extract all occurrences of the pattern from each body.
muni2 <- str_extract_all(
  CrowdTangle_Clean$body,
  paste(muni_clean$name_muni, collapse = "|"),
  simplify = FALSE)
# Flatten the per-post lists into one row per occurrence.
muni_list <- muni2 %>%
  unlist() %>%
  as.data.frame()
colnames(muni_list) <- "name_muni" # give the single column a meaningful name
# Count mentions per municipality; count() produces a column named `n`.
muni_mentioned <- muni_list %>%
  group_by(name_muni) %>%
  count()
# Adding the number of mentions to the municipality table.
muni_clean <- as.data.frame(muni_clean) # because R considered it a "large list", not a dataframe
muni3 <- left_join(muni_clean, muni_mentioned, by = "name_muni", copy = FALSE, keep = FALSE)
# Fixed: the join adds the count column `n` (there is no `freq` column — the
# original line created a useless freq column and left NA counts untouched).
# Municipalities never mentioned get NA after the join; set them to 0.
muni3$n <- muni3$n %>% replace_na(0)
####################################################
######## REPLACE AMBIGUOUS NAMES
# TO BE DONE, 300 IS A TEST
# "campanha" is both a municipality and the ordinary Portuguese word for
# "campaign", so raw string matching wildly over-counts it; 300 is a
# placeholder value until its genuine mentions are identified manually.
muni3 <- muni3 %>%
  mutate(n = ifelse(name_muni == "campanha", 300, n))
# Exclude "campanha" from the name list used in the per-candidate analyses.
# FIX: the original !grepl("campanha", ...) dropped every name *containing*
# the substring "campanha"; an exact comparison removes only that one name.
filtered_muni_names <- muni_clean$name_muni[muni_clean$name_muni != "campanha"]
####################################################
######## #UNIQUE MENTIONS PER CANDIDATE
# Total municipality mentions per candidate, excluding "campanha".
# NOTE(review): despite its name, the "unique_mentions" column is a *total*
# occurrence count — str_count() tallies every match, not distinct cities.
# The column name is kept unchanged because downstream code reads it.
muni_total_candidate <- CrowdTangle_Clean %>%
  group_by(page_name) %>%
  summarize(unique_mentions = sum(str_count(body,
    paste(filtered_muni_names, collapse = "|")),
    na.rm = TRUE)) # FIX: one NA body no longer wipes out a candidate's total
# str_count() counts the occurrences of municipality names in each body;
# sum() then aggregates the per-post counts within each page_name.
muni_total_candidate[is.na(muni_total_candidate)] <- 0
####################################################
########
# Total mentions per candidate, plus which municipalities each one named.
muni_unique_candidate <- CrowdTangle_Clean %>%
  mutate(
    # list-column: every municipality name extracted from each post body
    municipalities = str_extract_all(
      body,
      paste(filtered_muni_names, collapse = "|")
    )
  ) %>%
  group_by(page_name) %>%
  summarize(
    # lengths() yields the match count per post; summing gives the total
    total_mentions = sum(lengths(municipalities)),
    # collapse the distinct names into one comma-separated string
    mentioned_municipalities = toString(unique(unlist(municipalities)))
  )
################
################
#### ???? ####
#### ???? ####
#### ???? ####
################
################
Questions:
Why is the code above (unique vs. total) producing results that are so different?
The code above fails to indicate the number of distinct cities each candidate mentioned in their posts (as opposed to the total number of mentions).
Regarding ambiguous names, would the best approach be to manually identify their occurrences and change them to another name (e.g., “Campanha” to “Municipio de Campanha”) so that the code above can identify them?
Adding Census Data
# Normalize the IBGE municipality names exactly as above so the join key
# ("name_muni") matches between the two tables.
MGMunicipalities_Clean <- MGMunicipalities
# FIX: namespace-qualify stri_trans_general(), which lives in {stringi}
# (not attached by the setup chunk).
MGMunicipalities_Clean$name_muni <- stringi::stri_trans_general(str = MGMunicipalities_Clean$name_muni, id = "Latin-ASCII") # removing accents
MGMunicipalities_Clean$name_muni <- tolower(MGMunicipalities_Clean$name_muni) # lower case
IBGE <- left_join(muni3, MGMunicipalities_Clean, by = "name_muni", copy = TRUE, keep = FALSE)
# Replace every NA in the joined table with 0 (applies across all columns).
IBGE <- replace(IBGE, is.na(IBGE), 0)
How many municipalities were mentioned less than ten times?
# Count municipalities mentioned fewer than ten times across the corpus.
sum(IBGE$n <10, na.rm=TRUE)
## [1] 254
What is the average number of municipalities mentioned by a single candidate?
# Average municipality mentions per candidate. NOTE(review): despite the
# column name, "unique_mentions" holds total mention counts, not counts of
# distinct municipalities.
mean(muni_total_candidate$unique_mentions)
## [1] 16.71866
There were 29,536 posts that made reference to MG municipalities during the last election. On average, candidates made roughly 17 (16.72) municipality references in their political communication.
# Pearson correlations between the number of mentions a municipality
# received (n) and three of its characteristics.
# Mentions vs. population (2021 estimate)
CorrelationPopulation <- cor.test(IBGE$n, IBGE$Population_2021, method = "pearson")
# Mentions vs. Human Development Index (IDHM, in Portuguese)
CorrelationIDHM <- cor.test(IBGE$n, IBGE$IDHM, method = "pearson")
# Mentions vs. GDP per capita (PIB, in Portuguese)
CorrelationPIB <- cor.test(IBGE$n, IBGE$PIB_per_capita, method = "pearson")
CorrelationPopulation
##
## Pearson's product-moment correlation
##
## data: IBGE$n and IBGE$Population_2021
## t = 20.332, df = 851, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5248037 0.6152955
## sample estimates:
## cor
## 0.5717862
CorrelationIDHM
##
## Pearson's product-moment correlation
##
## data: IBGE$n and IBGE$IDHM
## t = 7.625, df = 851, p-value = 6.521e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1889682 0.3146692
## sample estimates:
## cor
## 0.2528856
CorrelationPIB
##
## Pearson's product-moment correlation
##
## data: IBGE$n and IBGE$PIB_per_capita
## t = 4.2311, df = 851, p-value = 2.578e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.07715752 0.20865394
## sample estimates:
## cor
## 0.1435392
# Jittered scatterplots of mentions against each municipal characteristic,
# colored by mesoregion. Plot-level aes() is declared inside ggplot() and
# inherited by the jitter layer — equivalent to adding aes() afterwards.
ggplot(IBGE, aes(x = n, y = IDHM, color = Mesorregiao)) +
  geom_jitter() +
  labs(title = "Relationship between mentions and Human Development Index",
       x = "Mentions", y = "Human Development Index")
ggplot(IBGE, aes(x = n, y = Population_2021, color = Mesorregiao)) +
  geom_jitter() +
  labs(title = "Relationship between mentions and population",
       x = "Mentions", y = "Population")
ggplot(IBGE, aes(x = n, y = PIB_per_capita, color = Mesorregiao)) +
  geom_jitter() +
  labs(title = "Relationship between mentions and PIB",
       x = "Mentions", y = "PIB per capita")
# Interactive version: jittered scatter of mentions vs. 2021 population;
# the "text" aesthetic makes municipality names appear on hover in plotly.
BV <- ggplot()+
  geom_jitter(data=IBGE, aes(x = n, y = Population_2021, text=name_muni)) +
  labs(title = "Relationship between mentions and population (estimated for 2021)", x = "Mentions", y = "population")+
  guides(fill = guide_colourbar(title = "Mentions"))
# FIX: ggplotly() comes from {plotly}, which is not attached in the setup
# chunk; namespace-qualify the call so this chunk runs on its own.
InteractivePopulation <- plotly::ggplotly(BV)
InteractivePopulation
# Small multiples of the same plot, one panel per mesoregion.
BV + facet_wrap(vars(Mesorregiao))
# Jittered scatter of GDP per capita against mentions, by mesoregion
# (nearly identical to the earlier PIB plot, differing only in the title).
ggplot(IBGE, aes(x = n, y = PIB_per_capita, color = Mesorregiao)) +
  geom_jitter() +
  labs(title = "Relationship between PIB per capita and mentions",
       x = "Mentions", y = "PIB per capita")
# Distribution of mentions across municipalities (raw counts). The heavy
# right skew motivates the log transformation used in the model below.
hist(IBGE$n,
breaks = 100, # suggested number of bins (hist() treats this as a hint, not an exact count)
col = "skyblue", # Set the color of the bars
border = "white", # Set the color of the bar borders
ylab = "", # Suppress the y-axis label
xlab = "", # Suppress the x-axis label
main = "Histogram of Mentions" # Set the main title
)
IBGE_log <- IBGE
# Log transformation of number of mentions; the +1 offset avoids log(0)
# for municipalities that were never mentioned.
IBGE_log$n <- log(IBGE_log$n + 1)
hist(IBGE_log$n,
col = "skyblue", # Set the color of the bars
border = "white", # Set the color of the bar borders
xlab = "", # Set the x-axis label
main = "Histogram of Mentions" # Set the main title
)
# OLS model: log-transformed mentions regressed on population, HDI, and
# GDP per capita.
Model_Geo <- lm(n ~ Population_2021 + IDHM + PIB_per_capita,
data = IBGE_log)
# FIX: Anova() and vif() come from {car}, which is not attached in the
# setup chunk; namespace-qualify both calls so this chunk runs on its own.
omnibus_geo <- car::Anova(Model_Geo, type = "III")
omnibus_geo
## Anova Table (Type III tests)
##
## Response: n
## Sum Sq Df F value Pr(>F)
## (Intercept) 11.61 1 7.4126 0.00661 **
## Population_2021 112.07 1 71.5384 < 2e-16 ***
## IDHM 125.06 1 79.8326 < 2e-16 ***
## PIB_per_capita 7.41 1 4.7322 0.02988 *
## Residuals 1329.96 849
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for multicollinearity using vif() — values near 1 indicate none.
vif_geo <- car::vif(Model_Geo)
print(vif_geo)
## Population_2021 IDHM PIB_per_capita
## 1.052033 1.153857 1.105105
summary(Model_Geo)
##
## Call:
## lm(formula = n ~ Population_2021 + IDHM + PIB_per_capita, data = IBGE_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.6402 -0.6919 -0.0019 0.6531 4.3176
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.206e+00 4.429e-01 -2.723 0.00661 **
## Population_2021 3.657e-06 4.324e-07 8.458 < 2e-16 ***
## IDHM 6.098e+00 6.825e-01 8.935 < 2e-16 ***
## PIB_per_capita 3.974e-08 1.827e-08 2.175 0.02988 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.252 on 849 degrees of freedom
## Multiple R-squared: 0.2105, Adjusted R-squared: 0.2077
## F-statistic: 75.47 on 3 and 849 DF, p-value: < 2.2e-16
Based on the number of times municipalities in Minas Gerais were mentioned, I plotted an interactive choropleth map. The map allows the reader to hover over the municipalities and see their names and the number of times they were mentioned.
# Plotting an interactive choropleth map of mentions per municipality.
# geom_sf draws the municipal polygons (geometry column "geom"); the
# "text" aesthetic supplies the hover label once converted to plotly.
Map <- ggplot() +
  geom_sf(data=IBGE, aes(fill=n, text = name_muni, geometry = geom), color= NA, size=.15) +
  labs(title="2022 Election: municipalities mentioned", size=10) +
  guides(fill = guide_colourbar(title = "Mentions")) +
  theme_minimal() +
  theme(axis.line = element_blank(), axis.text = element_blank(),
        axis.ticks = element_blank(), axis.title = element_blank(),
        panel.grid = element_blank()) +
  scale_fill_viridis(option="viridis")
# FIX: ggplotly() and layout() come from {plotly}, which is not attached in
# the setup chunk; namespace-qualify both so this chunk runs on its own.
InteractiveMap <- plotly::ggplotly(Map) %>%
  plotly::layout(title = list(text = paste0("<B>2022 Election: municipalities mentioned</B>")))
InteractiveMap